In [1]:
# limit GPU usage (if any) to this GPU; keep the comment off the %env line so it isn't swallowed into the value
%env CUDA_VISIBLE_DEVICES=1
In [2]:
import numpy as np
from classifier import common
import os

# labels is a dict mapping sha256 (file hash) -> label
labels = common.fetch_samples()

from sklearn.model_selection import train_test_split
np.random.seed(123)  # for a reproducible split
y_train, y_test, sha256_train, sha256_test = train_test_split(
    list(labels.values()), list(labels.keys()), test_size=1000)
So, let's move on to "real" end-to-end deep learning, because deep learning does everything better, right? There are a few things to note before we do.
But, think of it! End-to-end deep learning for static malware detection. No PE parsing required! No feature engineering required! No work required! Right?
You can find code that defines the end-to-end model architecture at classifier/endtoend.py.
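If you'd rather not flip over to that file just yet, here is a rough sketch of the kind of model create_model builds. Only the function signature and its defaults are taken from the source; the layer stack itself is my assumption, so treat this as illustrative rather than as the actual classifier/endtoend.py architecture.

from keras.layers import (Input, Embedding, Dropout, Conv1D, TimeDistributed,
                          GlobalMaxPooling1D, Flatten, Dense)
from keras.models import Model

# hypothetical stand-in for endtoend.create_model -- illustrative only
def sketch_create_model(input_shape, byte_embedding_size=2, input_dropout=0.05,
                        hidden_dropout=0.05, kernel_size=16,
                        n_filters_per_layer=[64, 256, 1024], n_mlp_layers=2):
    inp = Input(shape=input_shape)  # (file_chunks, file_chunk_size) of raw byte values
    x = Embedding(256, byte_embedding_size)(inp)  # learn a tiny vector per byte value
    x = Dropout(input_dropout)(x)
    for n_filters in n_filters_per_layer:
        # convolve within each chunk independently; the stride shrinks the sequence
        x = TimeDistributed(Conv1D(n_filters, kernel_size,
                                   strides=kernel_size // 2,
                                   activation='relu'))(x)
    x = TimeDistributed(GlobalMaxPooling1D())(x)  # one feature vector per chunk
    x = Flatten()(x)                              # concatenate the chunk features
    for _ in range(n_mlp_layers):
        x = Dense(128, activation='relu')(x)
        x = Dropout(hidden_dropout)(x)
    out = Dense(1, activation='sigmoid')(x)       # P(malicious)
    model = Model(inp, out)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model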
In [3]:
# for this demo, we'll slurp in only the first 256KB (2**18 bytes) of each file
# with a nice GPU like a Titan X, you should be able to squeeze in > 2MB ...
# ...but be warned! that makes training more difficult: a larger haystack in which to find the needles
max_file_length = int(2**18)  # powers of 2 FTW
file_chunks = 8 # break file into this many chunks
file_chunk_size = max_file_length // file_chunks
batch_size = 8
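To make the chunking concrete: each sample becomes a (file_chunks, file_chunk_size) array of raw byte values, zero-padded or truncated to max_file_length. The real reading logic lives in classifier.common.get_file_data (not shown here); the hypothetical helper below just illustrates the shape gymnastics.

import numpy as np

# hypothetical helper, for illustration only -- the real logic is in classifier/common.py
def bytes_to_chunks(raw_bytes, max_file_length, file_chunks, file_chunk_size):
    # zero-pad short files and truncate long ones, so every sample is the same size
    buf = np.zeros(max_file_length, dtype=np.uint8)
    data = np.frombuffer(raw_bytes[:max_file_length], dtype=np.uint8)
    buf[:len(data)] = data
    # reshape the flat byte stream into 8 chunks of 32KB each
    return buf.reshape(file_chunks, file_chunk_size)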
In [ ]:
# Note: this is a very long-running cell, and the output below
# may appear to be truncated before training completes
# let's train this puppy
from classifier import endtoend
import math
from keras.callbacks import LearningRateScheduler, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
# for reference, the full signature with its defaults:
# create_model(input_shape, byte_embedding_size=2, input_dropout=0.05, hidden_dropout=0.05, kernel_size=16, n_filters_per_layer=[64,256,1024], n_mlp_layers=2)
model_e2e = endtoend.create_model(input_shape=(file_chunks, file_chunk_size))
train_generator = common.generator(list(zip(sha256_train, y_train)), batch_size, file_chunks, file_chunk_size)
test_generator = common.generator(list(zip(sha256_test, y_test)), 1, file_chunks, file_chunk_size)
training_history = model_e2e.fit_generator(
    train_generator,
    steps_per_epoch=math.ceil(len(sha256_train) / batch_size),
    epochs=100,
    callbacks=[
        EarlyStopping(patience=10),                           # stop if val loss stalls for 10 epochs
        ModelCheckpoint('endtoend.h5', save_best_only=True),  # keep only the best model on disk
        ReduceLROnPlateau(patience=5)],                       # drop the learning rate when stuck
    validation_data=test_generator,
    validation_steps=len(sha256_test))
Notice that the output above is truncated, because the Jupyter notebook client couldn't muster the patience to wait for all of the output coming from the kernel. It got bored. It moved along. (shakes fist) Millennials!
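For reference, the generators handed to fit_generator above just walk the (sha256, label) pairs forever and yield ready-to-train batches. Here's a sketch of what classifier.common.generator might look like: the shapes match the calls above, but the internals are my guess.

import random
import numpy as np

# hypothetical stand-in for classifier.common.generator -- illustrative only
def generator_sketch(samples, batch_size, file_chunks, file_chunk_size):
    # samples is a list of (sha256, label) pairs; Keras expects the
    # generator to yield (X, y) batches indefinitely
    max_file_length = file_chunks * file_chunk_size
    while True:
        random.shuffle(samples)
        for i in range(0, len(samples), batch_size):
            batch = samples[i:i + batch_size]
            X = np.asarray([common.get_file_data(sha256, lab, max_file_length)
                            for sha256, lab in batch]).reshape(
                                (-1, file_chunks, file_chunk_size))
            y = np.asarray([lab for sha256, lab in batch])
            yield X, y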
In [4]:
from keras.models import load_model
# load the "best" model saved by our ModelCheckpoint callback -- in this case
# the penultimate model, which is only marginally better than the final model
# we already have in hand
model_e2e = load_model('endtoend.h5')
y_pred = []
for sha256, lab in zip(sha256_test, y_test):
    # read (and zero-pad/truncate) the raw bytes for this sample, then reshape
    # into a single-sample batch of shape (1, file_chunks, file_chunk_size)
    y_pred.append(
        model_e2e.predict_on_batch(
            np.asarray([common.get_file_data(sha256, lab, max_file_length)]).reshape(
                (-1, file_chunks, file_chunk_size))))
common.summarize_performance(np.asarray(y_pred).flatten(), y_test, "End-to-end convnet")
Out[4]:
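common.summarize_performance prints the headline numbers for the run. I won't reproduce its internals here, but a minimal stand-in, assuming it reports ROC AUC and accuracy at a 0.5 threshold, might look like this:

import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score

# hypothetical stand-in for classifier.common.summarize_performance
def summarize_sketch(y_pred, y_true, name):
    y_pred, y_true = np.asarray(y_pred), np.asarray(y_true)
    print('{}: ROC AUC = {:.4f}, accuracy = {:.4f} (at a 0.5 threshold)'.format(
        name, roc_auc_score(y_true, y_pred),
        accuracy_score(y_true, y_pred > 0.5)))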